library(xgboost)
library(randomForest)
library(tidyverse)
library(lubridate)

source('functions.r')
load("Table_construction.Rdata")

General Recidivism

Generic stuff (applies to all models)

### Add useful columns to features and apply row filters used for all models
# Joins `features` with the demographics nested in `data_before$people`, then
# with `features_on` and `outcomes`, always on (person_id, screening_date).
# Inner joins keep only rows present in every table.
features_filt <- features %>%
  inner_join(
    data_before %>%
      select(person_id, screening_date, people) %>%
      # Name the list-column explicitly: a bare unnest() is deprecated in
      # tidyr >= 1.0 (it warns that `cols` is required).
      unnest(people) %>%
      select(person_id, screening_date, race, sex, name),
    by = c("person_id", "screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id", "screening_date")) %>%
  inner_join(outcomes, by = c("person_id", "screening_date")) %>%
  # Filter 1: drop rows whose decile score is the -1 sentinel
  filter(
    `Risk of Recidivism_decile_score` != -1,
    `Risk of Violence_decile_score` != -1
  ) %>%
  # Filter 3: a current offense date must be present
  filter(!is.na(current_offense_date)) %>%
  mutate(
    p_recid_raw = `Risk of Recidivism_raw_score`,
    # Hard-coded quadratic in current age; subtracted from the raw score below
    age_poly = 0.000492285131636128000 * p_current_age^2 -
      0.0775341320826139000 * p_current_age +
      0.0305011936304372000,
    # Remainder of the raw score once the age polynomial is removed
    p_recid_raw_noage = p_recid_raw - age_poly
  )

## Hyper-parameter grids (each combination will be run)

# xgboost: vector-valued entries list the candidate values to combine.
# NOTE(review): recent xgboost releases deprecate "reg:linear" in favour of
# "reg:squarederror" -- confirm the installed version before renaming.
param <- list(
  objective = "reg:linear",
  eval_metric = "rmse",
  eta = c(0.05, 0.1),
  gamma = c(0.5, 1),
  max_depth = c(2, 5),
  min_child_weight = c(5, 10),
  subsample = 1,
  colsample_bytree = 1
)

# svm: same convention, consumed by fit_svm()
param_svm <- list(
  type = "eps-regression",
  cost = c(0.5, 1, 2),
  epsilon = c(0.5, 1, 1.5),
  gamma_scale = c(0.5, 1, 2)
)

# One row per model group, one column per model family; filled in below
res_rmse <- data.frame(
  Group = seq_len(5),
  lm = NA,
  xgb = NA,
  rf = NA,
  svm = NA
)
## Age polynomial
# Scatter of the raw score against current age with the age polynomial drawn
# on top; the constant colour aesthetics only separate the two layers (the
# legend is hidden).
ggplot(data = features_filt) +
  geom_point(
    aes(x = p_current_age, y = p_recid_raw, color = "b"),
    alpha = 0.3
  ) +
  geom_line(aes(x = p_current_age, y = age_poly, color = "a")) +
  theme_bw() +
  xlim(18, 70) +
  labs(x = "Age at COMPAS screening date", y = "COMPAS general raw") +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12),
    legend.position = "none"
  )
## Warning: Removed 19 rows containing missing values (geom_point).
## Warning: Removed 19 rows containing missing values (geom_path).

# ggsave() writes the most recently created plot
ggsave("Figures/age_agePoly_general.pdf", width = 3.5, height = 2.5, units = "in")
## Warning: Removed 19 rows containing missing values (geom_point).
## Warning: Removed 19 rows containing missing values (geom_path).
### Number of priors vs. COMPAS remainder
# Scatter of the age-adjusted raw score against the prior-charge count;
# the x axis is clipped to [0, 60].
features_filt %>%
  ggplot() +
  geom_point(aes(x = p_charge, y = p_recid_raw_noage), alpha = 0.3) +
  theme_bw() +
  labs(x = "Number of prior charges", y = "COMPAS general raw remainder") +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12)
  ) +
  xlim(0, 60)
## Warning: Removed 11 rows containing missing values (geom_point).

ggsave("Figures/priors_rawScoreRemain_general.pdf", width = 3.5, height = 3.5, units = "in")
## Warning: Removed 11 rows containing missing values (geom_point).

Replicating ProPublica logistic regression

# Recode the shared table into the indicator variables used by the
# logistic regression below.
propub <- features_filt %>%
  # Only people with valid recidivism values
  filter(screening_date <= current_offense_date_limit) %>%
  mutate(
    age_low = if_else(p_current_age < 25, 1, 0),
    age_high = if_else(p_current_age > 45, 1, 0),
    female = if_else(sex == "Female", 1, 0),
    # NOTE(review): n_priors is not used by the formula below (it uses
    # p_charge) -- confirm whether it is needed elsewhere.
    n_priors = p_felony_count_person + p_misdem_count_person,
    # Medium and High risk scores get +1 label
    compas_high = if_else(`Risk of Recidivism_decile_score` >= 5, 1, 0),
    # Base level is Caucasian, as in ProPublica analysis
    race = relevel(factor(race), ref = "Caucasian")
  )

# Binomial GLM with logit link for the binary compas_high label
mdl_glm <- glm(
  compas_high ~
    female +
    age_high +
    age_low +
    as.factor(race) +
    p_charge +
    is_misdem +
    recid,
  family = binomial(link = "logit"),
  data = propub
)

summary(mdl_glm)
## 
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) + 
##     p_charge + is_misdem + recid, family = binomial(link = "logit"), 
##     data = propub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7558  -0.7641  -0.3055   0.8402   2.6712  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -1.604475   0.082621 -19.420  < 2e-16 ***
## female                           0.129382   0.085337   1.516   0.1295    
## age_high                        -1.485491   0.130168 -11.412  < 2e-16 ***
## age_low                          1.443967   0.071469  20.204  < 2e-16 ***
## as.factor(race)African-American  0.522728   0.073118   7.149 8.73e-13 ***
## as.factor(race)Asian            -0.270324   0.504164  -0.536   0.5918    
## as.factor(race)Hispanic         -0.307350   0.131763  -2.333   0.0197 *  
## as.factor(race)Native American   0.387967   0.678561   0.572   0.5675    
## as.factor(race)Other            -0.720007   0.160307  -4.491 7.08e-06 ***
## p_charge                         0.156196   0.006582  23.731  < 2e-16 ***
## is_misdem                       -0.449143   0.069824  -6.433 1.26e-10 ***
## recid                            0.497513   0.068974   7.213 5.47e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7864.7  on 5726  degrees of freedom
## Residual deviance: 5633.5  on 5715  degrees of freedom
## AIC: 5657.5
## 
## Number of Fisher Scoring iterations: 5

Group 1 models: predicting (raw score - age polynomial) without using age variables or race

### Create group 1 training data

## Keep the predictors and the target; cap three count features at 5
train <- features_filt %>%
  transmute(
    # p_current_age,  (left out for this group)
    p_age_first_offense,
    p_charge,
    p_jail30 = pmin(p_jail30, 5),
    p_prison = pmin(p_prison, 5),
    p_probation = pmin(p_probation, 5),
    p_recid_raw_noage
  )

## Format for xgboost: predictor matrix plus one-column label matrix
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)

Model 1: Linear model

# Ordinary least squares on every column of `train` except the target
mdl_lm <- lm(p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
## 
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.67940 -0.45225 -0.06694  0.37222  2.54441 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.0555467  0.0177396   59.50   <2e-16 ***
## p_age_first_offense -0.0067863  0.0005581  -12.16   <2e-16 ***
## p_charge             0.0256917  0.0010352   24.82   <2e-16 ***
## p_jail30            -0.0073008  0.0404757   -0.18    0.857    
## p_prison             0.2084061  0.0085096   24.49   <2e-16 ***
## p_probation          0.1179271  0.0074736   15.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5759 on 9036 degrees of freedom
## Multiple R-squared:  0.4061, Adjusted R-squared:  0.4058 
## F-statistic:  1236 on 5 and 9036 DF,  p-value: < 2.2e-16
# Training-set RMSE of the linear model, stored in the Group 1 row
res_rmse$lm[res_rmse$Group == 1] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP

Model 2: xgboost

# Fix the RNG seed, then let the project helper search the `param` grid
set.seed(923)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  6           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
# Training-set predictions against the actual target values
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage

res_rmse$xgb[res_rmse$Group == 1] <- rmse(pred, actual) # ADJUST GROUP

# Common limits so both axes span predictions and targets alike
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)

ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") + # y = x reference line
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  labs(x = "COMPAS remainder", y = "xgboost prediction") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

# Fix the RNG seed before growing the forest
set.seed(784)

mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)

# NOTE(review): per the randomForest docs `$predicted` holds out-of-bag
# predictions, whereas lm/xgboost above are scored on in-sample predictions --
# confirm the asymmetry is intended.
res_rmse$rf[res_rmse$Group == 1] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP

Model 4: SVM

# SVM regression tuned over `param_svm` by the project helper fit_svm()
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## [1] "Best parameters:"
##             19              
## type        "eps-regression"
## cost        "0.5"           
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.3333333"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
# RMSE of the SVM's stored fitted values, written to the Group 1 row
res_rmse$svm[res_rmse$Group == 1] <- rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP

Cleanup

# Drop this group's data and models so nothing leaks into the next group
rm(list = c("train", "train_xgb", "mdl_lm", "mdl_xgb", "mdl_rf", "mdl_svm"))

Group 2 models: predicting (raw score - age polynomial) without using age variables but with race

### Create group 2 training data

## Keep the predictors and the target; cap three count features at 5 and
## add 0/1 race indicators
train <- features_filt %>%
  transmute(
    # p_current_age,  (left out for this group)
    p_age_first_offense,
    p_charge,
    p_jail30 = pmin(p_jail30, 5),
    p_prison = pmin(p_prison, 5),
    p_probation = pmin(p_probation, 5),
    race_black = if_else(race == "African-American", 1, 0),
    race_white = if_else(race == "Caucasian", 1, 0),
    race_hispanic = if_else(race == "Hispanic", 1, 0),
    race_asian = if_else(race == "Asian", 1, 0),
    # race == "Other" is the baseline
    race_native = if_else(race == "Native American", 1, 0),
    p_recid_raw_noage
  )

## Format for xgboost: predictor matrix plus one-column label matrix
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)

Model 1: Linear model

# Ordinary least squares on every column of `train` except the target
mdl_lm <- lm(p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
## 
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.50492 -0.43903 -0.06279  0.36448  2.39305 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.7336356  0.0298015  24.617  < 2e-16 ***
## p_age_first_offense -0.0048067  0.0005678  -8.466  < 2e-16 ***
## p_charge             0.0248022  0.0010184  24.355  < 2e-16 ***
## p_jail30             0.0065670  0.0397765   0.165  0.86887    
## p_prison             0.1984432  0.0083906  23.651  < 2e-16 ***
## p_probation          0.1147516  0.0073457  15.622  < 2e-16 ***
## race_black           0.3692502  0.0257204  14.356  < 2e-16 ***
## race_white           0.2448093  0.0259935   9.418  < 2e-16 ***
## race_hispanic        0.0864109  0.0311533   2.774  0.00555 ** 
## race_asian           0.0919404  0.0859397   1.070  0.28473    
## race_native          0.2592508  0.1096961   2.363  0.01813 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5657 on 9031 degrees of freedom
## Multiple R-squared:  0.4273, Adjusted R-squared:  0.4267 
## F-statistic: 673.8 on 10 and 9031 DF,  p-value: < 2.2e-16
# Training-set RMSE of the linear model, stored in the Group 2 row
res_rmse$lm[res_rmse$Group == 2] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP

Model 2: xgboost

# Fix the RNG seed, then let the project helper search the `param` grid
set.seed(480)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  14          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
# Training-set predictions against the actual target values
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage

res_rmse$xgb[res_rmse$Group == 2] <- rmse(pred, actual) # ADJUST GROUP

# Common limits so both axes span predictions and targets alike
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)

ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") + # y = x reference line
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  labs(x = "COMPAS raw score remainder", y = "XGBoost prediction") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )

# Same predictions against the unadjusted raw score; relies on `train` and
# `features_filt` sharing row order
ggplot(data.frame(xgboost = pred, compas = features_filt$p_recid_raw)) +
  geom_point(aes(x = xgboost, y = compas), alpha = 0.3) +
  theme_bw() +
  labs(x = "XGBoost prediction", y = "COMPAS raw score") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

# Fix the RNG seed before growing the forest
set.seed(6778)

mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)

# NOTE(review): per the randomForest docs `$predicted` holds out-of-bag
# predictions, whereas lm/xgboost above are scored on in-sample predictions --
# confirm the asymmetry is intended.
res_rmse$rf[res_rmse$Group == 2] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP

Model 4: SVM

# SVM regression tuned over `param_svm` by the project helper fit_svm()
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## [1] "Best parameters:"
##             12              
## type        "eps-regression"
## cost        "2"             
## epsilon     "0.5"           
## gamma_scale "1"             
## gamma       "0.09090909"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
# RMSE of the SVM's stored fitted values, written to the Group 2 row
res_rmse$svm[res_rmse$Group == 2] <- rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP

Cleanup

# Drop this group's data and models before the next group is built.
# mdl_svm is removed too (as in the Group 1 cleanup) so a failed SVM fit in
# the next group cannot silently reuse this group's model.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Group 3 models: predicting (raw score - age polynomial) without using race but with age variables

### Create group 3 training data

## Keep the predictors (current age included) and the target; cap three
## count features at 5
train <- features_filt %>%
  transmute(
    p_current_age,
    p_age_first_offense,
    p_charge,
    p_jail30 = pmin(p_jail30, 5),
    p_prison = pmin(p_prison, 5),
    p_probation = pmin(p_probation, 5),
    p_recid_raw_noage
  )

## Format for xgboost: predictor matrix plus one-column label matrix
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)

Model 1: Linear model

# Ordinary least squares on every column of `train` except the target
mdl_lm <- lm(p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
## 
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.26968 -0.44787 -0.07239  0.36897  2.55581 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.014740   0.018166  55.858   <2e-16 ***
## p_current_age        0.011466   0.001206   9.510   <2e-16 ***
## p_age_first_offense -0.017645   0.001270 -13.897   <2e-16 ***
## p_charge             0.022968   0.001069  21.483   <2e-16 ***
## p_jail30             0.020908   0.040386   0.518    0.605    
## p_prison             0.184507   0.008833  20.889   <2e-16 ***
## p_probation          0.096735   0.007764  12.460   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.573 on 9035 degrees of freedom
## Multiple R-squared:  0.412,  Adjusted R-squared:  0.4116 
## F-statistic:  1055 on 6 and 9035 DF,  p-value: < 2.2e-16
# Training-set RMSE of the linear model, stored in the Group 3 row
res_rmse$lm[res_rmse$Group == 3] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP

Model 2: xgboost

# Fix the RNG seed, then let the project helper search the `param` grid
set.seed(999)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  14          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
# Training-set predictions against the actual target values
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage

res_rmse$xgb[res_rmse$Group == 3] <- rmse(pred, actual) # ADJUST GROUP

# Common limits so both axes span predictions and targets alike
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)

ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") + # y = x reference line
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  labs(x = "COMPAS raw score remainder", y = "xgboost prediction") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

# Fix the RNG seed before growing the forest
set.seed(5)

mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)

# NOTE(review): per the randomForest docs `$predicted` holds out-of-bag
# predictions, whereas lm/xgboost above are scored on in-sample predictions --
# confirm the asymmetry is intended.
res_rmse$rf[res_rmse$Group == 3] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP

Model 4: SVM

# SVM regression tuned over `param_svm` by the project helper fit_svm()
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## [1] "Best parameters:"
##             11              
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "1"             
## gamma       "0.1428571"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
# RMSE of the SVM's stored fitted values, written to the Group 3 row
res_rmse$svm[res_rmse$Group == 3] <- rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP

Cleanup

# Drop this group's data and models before the next group is built.
# mdl_svm is removed too (as in the Group 1 cleanup) so a failed SVM fit in
# the next group cannot silently reuse this group's model.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Group 4 models: predicting (raw score - age polynomial) using age variables and race

### Create group 4 training data

## Keep the predictors (current age included) and the target; cap three
## count features at 5 and add 0/1 race indicators
train <- features_filt %>%
  transmute(
    p_current_age,
    p_age_first_offense,
    p_charge,
    p_jail30 = pmin(p_jail30, 5),
    p_prison = pmin(p_prison, 5),
    p_probation = pmin(p_probation, 5),
    race_black = if_else(race == "African-American", 1, 0),
    race_white = if_else(race == "Caucasian", 1, 0),
    race_hispanic = if_else(race == "Hispanic", 1, 0),
    race_asian = if_else(race == "Asian", 1, 0),
    # race == "Other" is the baseline
    race_native = if_else(race == "Native American", 1, 0),
    p_recid_raw_noage
  )

## Format for xgboost: predictor matrix plus one-column label matrix
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)

Model 1: Linear model

# Ordinary least squares on every column of `train` except the target
mdl_lm <- lm(p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
## 
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.09334 -0.42968 -0.06403  0.35999  2.40135 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.692957   0.029955  23.133  < 2e-16 ***
## p_current_age        0.011375   0.001187   9.583  < 2e-16 ***
## p_age_first_offense -0.015533   0.001254 -12.389  < 2e-16 ***
## p_charge             0.022096   0.001052  21.005  < 2e-16 ***
## p_jail30             0.034923   0.039688   0.880  0.37893    
## p_prison             0.174521   0.008714  20.028  < 2e-16 ***
## p_probation          0.093816   0.007629  12.298  < 2e-16 ***
## race_black           0.369928   0.025592  14.455  < 2e-16 ***
## race_white           0.238926   0.025871   9.235  < 2e-16 ***
## race_hispanic        0.093102   0.031006   3.003  0.00268 ** 
## race_asian           0.100984   0.085516   1.181  0.23768    
## race_native          0.247590   0.109155   2.268  0.02334 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5628 on 9030 degrees of freedom
## Multiple R-squared:  0.4331, Adjusted R-squared:  0.4324 
## F-statistic:   627 on 11 and 9030 DF,  p-value: < 2.2e-16
# Training-set RMSE of the linear model, stored in the Group 4 row
res_rmse$lm[res_rmse$Group == 4] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP

Model 2: xgboost

# Fix the RNG seed, then let the project helper search the `param` grid
set.seed(23)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
# Training-set predictions against the actual target values
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage

res_rmse$xgb[res_rmse$Group == 4] <- rmse(pred, actual) # ADJUST GROUP

# Common limits so both axes span predictions and targets alike
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)

ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") + # y = x reference line
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  labs(
    x = "COMPAS general raw remainder",
    y = "Prediction of COMPAS general raw remainder"
  ) +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12)
  )

ggsave("Figures/rawScoreRemain_xgboost_general.pdf", width = 4, height = 4, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

# (person_id, screening_date) pairs to emphasise in the scatter plot below
highlight <- data.frame(
  person_id = c(
    799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337,
    6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414
  ),
  screening_date = ymd(c(
    "2014-06-15", "2014-05-14", "2014-11-28", "2013-07-29", "2013-10-23",
    "2013-10-04", "2014-12-14", "2013-01-17", "2013-10-24", "2014-02-04",
    "2013-07-12", "2014-04-26", "2014-05-05", "2013-03-19", "2014-01-18",
    "2014-09-20", "2013-04-09", "2014-02-23", "2014-05-02", "2014-11-26"
  )),
  highlight = TRUE
)

# Attach the xgboost predictions column-wise (relies on `train_xgb` sharing
# row order with `features_filt`), then flag the rows matched in `highlight`
# and turn the flag into a two-level factor.
df_plot <- features_filt %>%
  bind_cols(xgboost = predict(mdl_xgb, newdata = train_xgb)) %>%
  left_join(highlight, by = c("person_id", "screening_date")) %>%
  mutate(
    # Unmatched rows come back as NA from the left join
    highlight = !is.na(highlight),
    highlight = factor(
      if_else(highlight, "In Table 5", "Not in Table 5"),
      levels = c("In Table 5", "Not in Table 5")
    )
  )

# person_ids whose name label is placed in each direction around the point
person_id_text_topright <- c(8375, 11231, 1515)
#person_id_text_topright = highlight$person_id
person_id_text_topleft <- c(1394, 1497)
person_id_text_botright <- c(11312, 6886, 8491, 10774)
person_id_text_botleft <- c(799)

ggplot() +
  geom_point(aes(x=xgboost,y=p_recid_raw, color=highlight),  alpha = .3, data = filter(df_plot, highlight=="Not in Table 5")) +
  geom_point(aes(x=xgboost,y=p_recid_raw, color=highlight),  data = filter(df_plot, highlight=="In Table 5")) +
  theme_bw()+
  geom_text(aes(x=xgboost,y=p_recid_raw,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topright & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=p_recid_raw,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topleft & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=p_recid_raw,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botright & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=p_recid_raw,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botleft & highlight=="In Table 5")) + 
  xlab("Prediction of COMPAS general raw remainder") +
  ylab("COMPAS general raw")+
  theme(
    text = element_text(size=12),
    axis.text=element_text(size=12),
    #legend.position = "top",
    legend.position="none") +
  scale_color_discrete(name = element_blank()) +
  xlim(0.2,3.5)
## Warning: Removed 2 rows containing missing values (geom_point).

ggsave("Figures/xgboost_rawScore_general.pdf",width = 4, height = 4, units = "in")
## Warning: Removed 2 rows containing missing values (geom_point).

Model 3: random forest

# Seed fixed so the forest is reproducible across runs.
set.seed(3720)

# Regress the age-adjusted score on every other column of train.
mdl_rf <- randomForest(p_recid_raw_noage ~ ., data = train)

# NOTE(review): per the randomForest documentation, $predicted holds
# out-of-bag predictions, so this RMSE is not an in-sample fit like the
# lm / xgboost values stored in the same table.
res_rmse$rf[res_rmse$Group == 4] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP

Model 4: SVM

# fit_svm() is a project helper; per the log below it trains on every
# combination in param_svm (27 sets) and prints the best parameters.
# Presumably the returned object is the model refit with those -- confirm.
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## [1] "Best parameters:"
##             20              
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.1666667"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
# Store the SVM's RMSE on its fitted training values in the group 4 row.
res_rmse$svm[res_rmse$Group == 4] <- rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP

Cleanup

# Drop every per-group object, including the SVM model, so that nothing from
# this group can be picked up by accident while fitting the next one.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Group 5 models: test

### Create group 5 training data

## Select features and cap the count features at 5
train = features_filt %>%
  transmute(
    p_current_age,
    p_age_first_offense,
    p_charge,
    p_arrest,
    p_jail30 = pmin(p_jail30, 5),
    # Must be built from p_prison30: capping p_jail30 here would just copy the
    # jail column, making p_prison30 perfectly collinear with p_jail30
    # (lm then reports an NA coefficient "because of singularities").
    p_prison30 = pmin(p_prison30, 5),
    p_prison = pmin(p_prison, 5),
    p_probation = pmin(p_probation, 5),
    # 0/1 race indicators
    race_black = if_else(race == "African-American", 1, 0),
    race_white = if_else(race == "Caucasian", 1, 0),
    race_hispanic = if_else(race == "Hispanic", 1, 0),
    race_asian = if_else(race == "Asian", 1, 0),
    race_native = if_else(race == "Native American", 1, 0), # race == "Other" is the baseline
    # Response: COMPAS raw score with the age polynomial subtracted
    p_recid_raw_noage)

## Build the xgboost input: predictors as the data matrix, response as label
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)

Model 1: Linear model

# Least-squares fit of the age-adjusted score on all group 5 features.
mdl_lm <- lm(p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
## 
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9676 -0.4323 -0.0629  0.3625  2.4067 
## 
## Coefficients: (1 not defined because of singularities)
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.695934   0.029936  23.247  < 2e-16 ***
## p_current_age        0.011899   0.001192   9.978  < 2e-16 ***
## p_age_first_offense -0.016087   0.001260 -12.772  < 2e-16 ***
## p_charge             0.014255   0.002143   6.651 3.08e-11 ***
## p_arrest             0.006877   0.001638   4.197 2.73e-05 ***
## p_jail30             0.023470   0.039746   0.591   0.5549    
## p_prison30                 NA         NA      NA       NA    
## p_prison             0.170361   0.008762  19.443  < 2e-16 ***
## p_probation          0.081617   0.008157  10.006  < 2e-16 ***
## race_black           0.371083   0.025570  14.512  < 2e-16 ***
## race_white           0.240438   0.025850   9.301  < 2e-16 ***
## race_hispanic        0.095303   0.030982   3.076   0.0021 ** 
## race_asian           0.101744   0.085438   1.191   0.2337    
## race_native          0.242682   0.109061   2.225   0.0261 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5623 on 9029 degrees of freedom
## Multiple R-squared:  0.4342, Adjusted R-squared:  0.4334 
## F-statistic: 577.3 on 12 and 9029 DF,  p-value: < 2.2e-16
# Training RMSE of the linear model, stored in the group 5 row.
res_rmse$lm[res_rmse$Group == 5] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP
## Warning in predict.lm(mdl_lm, newdata = train): prediction from a rank-
## deficient fit may be misleading

Model 2: xgboost

# Seed fixed for reproducibility of the search below.
set.seed(480)
# fit_xgboost() is a project helper; per the log below it trains on every
# combination in `param` (16 sets) and prints the selected parameters.
# Presumably the returned object is the best booster -- confirm.
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  8           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "1"         
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost: training-set fit for group 5
# Predictions on the training matrix against the age-adjusted COMPAS score.
actual <- train$p_recid_raw_noage
pred <- predict(mdl_xgb, newdata = train_xgb)

# Record the training RMSE in the group 5 row of the comparison table.
res_rmse$xgb[res_rmse$Group == 5] <- rmse(pred, actual) # ADJUST GROUP

# One common range for both axes keeps the identity line on the diagonal.
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)

tibble(compas = actual, xgboost = pred) %>%
  ggplot() +
  geom_point(aes(x = compas, y = xgboost), alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  labs(
    x = "COMPAS raw score remainder",
    y = "xgboost prediction"
  ) +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )

### Variable importance
# Bar plot of the feature importances extracted from the fitted booster.
xgb.plot.importance(xgb.importance(model = mdl_xgb))

Model 3: random forest

# Seed fixed so the forest is reproducible across runs.
set.seed(1123)

# Regress the age-adjusted score on every other column of train.
mdl_rf <- randomForest(p_recid_raw_noage ~ ., data = train)

# NOTE(review): per the randomForest documentation, $predicted holds
# out-of-bag predictions, so this RMSE is not an in-sample fit like the
# lm / xgboost values stored in the same table.
res_rmse$rf[res_rmse$Group == 5] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP

Model 4: SVM

# fit_svm() is a project helper; per the log below it trains on every
# combination in param_svm (27 sets) and prints the best parameters.
# Presumably the returned object is the model refit with those -- confirm.
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## [1] "Best parameters:"
##             12              
## type        "eps-regression"
## cost        "2"             
## epsilon     "0.5"           
## gamma_scale "1"             
## gamma       "0.07142857"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
# Store the SVM's RMSE on its fitted training values in the group 5 row.
res_rmse$svm[res_rmse$Group == 5] <- rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP

Cleanup

# Drop every per-group object, including the SVM model, so that only the
# res_rmse summary is carried forward.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Comparison

# Render the per-group RMSE of each model as a table.
res_rmse %>%
  knitr::kable()
Group lm xgb rf svm
1 0.5756646 0.5228469 0.5553384 0.5295423
2 0.5653175 0.5128263 0.5276490 0.5213844
3 0.5728047 0.5171593 0.5330514 0.5260242
4 0.5624647 0.5058171 0.5244834 0.5146050
5 0.5619167 0.4971066 0.5141074 0.5097631